Get your census API key: https://api.census.gov/data/key_signup.html
Configure environment:
library(tidycensus)
library(tidyverse)
library(sf)
library(tigris)
#library(lwgeom)
library(ggmap)
library(janitor)
theme_set(theme_bw())
options(tigris_use_cache = TRUE,
scipen = 4,
digits = 3)
Before we dive in, this presentation assumes that the user has basic familiarity with tidyverse, mainly dplyr. Knowing how to use %>% will be very helpful.
tidycensus gives access to census API and makes it easy to plot data on a map.
Data
simple features makes it easy to work with polygon data in R. It uses familiar the tidyverse framework: everything is a tibble, and it uses %>%.
Uses Google Maps API to get basemaps.
This loads the ACS variables for 2017:
variables_acs <- load_variables(year = 2017, dataset = "acs5", cache = TRUE)
| name | label | concept |
|---|---|---|
| B00001_001 | Estimate!!Total | UNWEIGHTED SAMPLE COUNT OF THE POPULATION |
| B00002_001 | Estimate!!Total | UNWEIGHTED SAMPLE HOUSING UNITS |
| B01001_001 | Estimate!!Total | SEX BY AGE |
| B01001_002 | Estimate!!Total!!Male | SEX BY AGE |
| B01001_003 | Estimate!!Total!!Male!!Under 5 years | SEX BY AGE |
This loads the variables from the Decennial Census in 2010:
variables_dec <- load_variables(year = 2010, dataset = "sf1", cache = TRUE)
| name | label | concept |
|---|---|---|
| H001001 | Total | HOUSING UNITS |
| H002001 | Total | URBAN AND RURAL |
| H002002 | Total!!Urban | URBAN AND RURAL |
| H002003 | Total!!Urban!!Inside urbanized areas | URBAN AND RURAL |
| H002004 | Total!!Urban!!Inside urban clusters | URBAN AND RURAL |
Query the total population of the continental U.S. states:
states <- get_decennial(geography = "state",
variables = c(total_pop = "P001001"),
geometry = TRUE,
output = "wide")
The states tibble contains the census data and the polygons for the geometries.
## Simple feature collection with 52 features and 3 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: -179 ymin: 17.9 xmax: 180 ymax: 71.4
## epsg (SRID): 4269
## proj4string: +proj=longlat +datum=NAD83 +no_defs
## # A tibble: 52 x 4
## GEOID NAME total_pop geometry
## <chr> <chr> <dbl> <MULTIPOLYGON [°]>
## 1 01 Alabama 4779736 (((-85 31, -85 31, -85 31, -85 31, -85 31, -85…
## 2 02 Alaska 710231 (((-165 54.1, -165 54.1, -165 54.1, -165 54.1,…
## 3 04 Arizona 6392017 (((-109 37, -109 37, -109 37, -109 36.9, -109 …
## 4 05 Arkansas 2915918 (((-94.6 36.5, -94.5 36.5, -94.4 36.5, -94.1 3…
## 5 06 Califor… 37253956 (((-122 37.9, -122 37.9, -122 37.9, -122 37.9,…
## 6 22 Louisia… 4533372 (((-88.9 29.8, -88.9 29.7, -88.9 29.7, -88.9 2…
## 7 21 Kentucky 4339367 (((-83.7 36.6, -83.7 36.6, -83.7 36.6, -83.7 3…
## 8 08 Colorado 5029196 (((-102 37, -102 37, -102 37, -102 37, -102 37…
## 9 09 Connect… 3574097 (((-71.9 41.3, -71.9 41.3, -71.9 41.3, -71.9 4…
## 10 10 Delaware 897934 (((-75.6 39.6, -75.6 39.6, -75.6 39.6, -75.6 3…
## # … with 42 more rows
Plot the data:
states %>%
filter(NAME != "Alaska",
NAME != "Hawaii",
!str_detect(NAME, "Puerto")) %>%
ggplot(aes(fill = total_pop)) +
geom_sf(color = NA) +
scale_fill_viridis_c("Total Population")
Pull the total population of each county in PA and plot it:
pennsylvania <- get_decennial(geography = "county",
variables = c(total_pop = "P001001"),
state = "PA",
geometry = TRUE,
output = "wide")
pennsylvania %>%
ggplot(aes(fill = total_pop)) +
geom_sf(color = NA) +
scale_fill_viridis_c()
We can stack multiple polygons in the same graph to highlight Allegheny County:
allegheny <- pennsylvania %>%
filter(str_detect(NAME, "Allegheny"))
pennsylvania %>%
ggplot() +
geom_sf(aes(fill = total_pop), color = NA) +
geom_sf(data = allegheny, color = "white", linetype = 2, size = 1, alpha = 0) +
scale_fill_viridis_c()
don’t use load census block data for total pop
don’t use graph
We can also use tidycensus to download demographic data for census tracts.
Set the variables we want to use first:
racevars <- c(White = "P005003",
Black = "P005004",
Asian = "P005006",
Hispanic = "P004003")
#note that this data is long, not wide
allegheny <- get_decennial(geography = "tract", variables = racevars,
state = "PA", county = "Allegheny County", geometry = TRUE,
summary_var = "P001001")
Calculate as a percentage of tract population:
allegheny <- allegheny %>%
mutate(pct = 100 * value / summary_value)
Facet by variable and map the data:
allegheny %>%
ggplot(aes(fill = pct)) +
geom_sf(color = NA) +
facet_wrap(~variable) +
scale_fill_viridis_c()
make rivers show up
st_erase <- function(x, y) {
st_difference(x, st_union(st_combine(y)))
}
allegheny_water <- area_water("PA", "Allegheny County", class = "sf")
allegheny %>%
ggplot() +
geom_sf() +
geom_sf(data = allegheny_water, fill = "blue")
#allegheny_erase <- st_erase(allegheny, allegheny_water)
We can overlay the boundaries of Pittsburgh over the same graph.
Download the boundary shapefile and use sf::st_read to read it into R:
city_pgh <- st_read("data/Pittsburgh_City_Boundary/Pittsburgh_City_Boundary.shp")
## Reading layer `Pittsburgh_City_Boundary' from data source `/cloud/project/data/Pittsburgh_City_Boundary/Pittsburgh_City_Boundary.shp' using driver `ESRI Shapefile'
## Simple feature collection with 8 features and 6 fields
## geometry type: POLYGON
## dimension: XY
## bbox: xmin: -80.1 ymin: 40.4 xmax: -79.9 ymax: 40.5
## epsg (SRID): 4326
## proj4string: +proj=longlat +datum=WGS84 +no_defs
allegheny %>%
ggplot() +
geom_sf(aes(fill = pct), color = NA) +
geom_sf(data = city_pgh, color = "white", line_type = 2, size = 1, alpha = 0) +
facet_wrap(~variable) +
scale_fill_viridis_c()
WPRDC fire data
We can also download the shapefile for the City of Pittsburgh wards. The 311 dataset is tagged with the ward the request originated from, so we can use that to aggregate and map the total number of 311 requests per ward.
df_311 <- read_csv("https://data.wprdc.org/datastore/dump/76fda9d0-69be-4dd5-8108-0de7907fc5a4")
| REQUEST_ID | CREATED_ON | REQUEST_TYPE | REQUEST_ORIGIN | STATUS | DEPARTMENT | NEIGHBORHOOD | COUNCIL_DISTRICT | WARD | TRACT | PUBLIC_WORKS_DIVISION | PLI_DIVISION | POLICE_ZONE | FIRE_ZONE | X | Y | GEO_ACCURACY |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 203364 | 2017-12-15 14:53:00 | Street Obstruction/Closure | Call Center | 1 | DOMI - Permits | Central Northside | 1 | 22 | 42003220600 | 1 | 22 | 1 | 1-7 | -80.0 | 40.5 | EXACT |
| 200800 | 2017-11-29 09:54:00 | Graffiti | Control Panel | 1 | Police - Zones 1-6 | South Side Flats | 3 | 16 | 42003160900 | 3 | 16 | 3 | 4-24 | -80.0 | 40.4 | APPROXIMATE |
| 201310 | 2017-12-01 13:23:00 | Litter | Call Center | 1 | DPW - Street Maintenance | Troy Hill | 1 | 24 | 42003240600 | 1 | 24 | 1 | 1-2 | -80.0 | 40.5 | EXACT |
| 200171 | 2017-11-22 14:54:00 | Water Main Break | Call Center | 0 | Pittsburgh Water and Sewer Authority | Banksville | 2 | 20 | 42003202300 | 5 | 20 | 6 | 4-9 | -80.0 | 40.4 | EXACT |
| 193043 | 2017-10-12 12:46:00 | Guide Rail | Call Center | 1 | DPW - Construction Division | East Hills | 9 | 13 | 42003130600 | 2 | 13 | 5 | 3-19 | -79.9 | 40.5 | EXACT |
| 196521 | 2017-10-31 15:17:00 | Guide Rail | Call Center | 1 | DPW - Construction Division | East Hills | 9 | 13 | 42003130600 | 2 | 13 | 5 | 3-19 | -79.9 | 40.5 | EXACT |
| 193206 | 2017-10-13 09:18:00 | Curb /Broken/Deteriorated | Call Center | 1 | DOMI - Permits | Mount Washington | 2 | 19 | 42003191500 | 5 | 19 | 3 | 4-27 | -80.0 | 40.4 | EXACT |
| 195917 | 2017-10-27 10:23:00 | Manhole Cover | Call Center | 0 | DOMI - Permits | Bluff | 6 | 1 | 42003010300 | 3 | 1 | 2 | 2-4 | -80.0 | 40.4 | EXACT |
| 179176 | 2017-08-14 14:00:00 | Neighborhood Issues | Control Panel | 0 | NA | Middle Hill | 6 | 5 | 42003050100 | 3 | 5 | 2 | 2-1 | -80.0 | 40.4 | APPROXIMATE |
| 190422 | 2017-09-29 11:46:00 | Mayor’s Office | Website | 1 | 311 | North Oakland | 8 | 4 | 42003040400 | 3 | 4 | 4 | 2-9 | -80.0 | 40.4 | APPROXIMATE |
wards <- st_read("data/PGHWards/PGHWards.shp")
## Reading layer `PGHWards' from data source `/cloud/project/data/PGHWards/PGHWards.shp' using driver `ESRI Shapefile'
## Simple feature collection with 35 features and 7 fields
## geometry type: POLYGON
## dimension: XY
## bbox: xmin: -80.1 ymin: 40.4 xmax: -79.9 ymax: 40.5
## epsg (SRID): 4326
## proj4string: +proj=longlat +datum=WGS84 +no_defs
## Simple feature collection with 35 features and 7 fields
## geometry type: POLYGON
## dimension: XY
## bbox: xmin: -80.1 ymin: 40.4 xmax: -79.9 ymax: 40.5
## epsg (SRID): 4326
## proj4string: +proj=longlat +datum=WGS84 +no_defs
## First 10 features:
## objectid wards_ wards_id ward wardtext Shape__Are Shape__Len
## 1 2 3 0 19 19 0.0011446 0.2389
## 2 3 4 0 24 24 0.0001901 0.0734
## 3 4 5 0 24 24 0.0000210 0.0274
## 4 5 6 0 20 20 0.0011755 0.2888
## 5 6 7 0 21 21 0.0002074 0.0619
## 6 7 8 0 23 23 0.0001031 0.0480
## 7 8 9 0 22 22 0.0001967 0.0593
## 8 9 10 0 26 26 0.0009079 0.1915
## 9 10 11 0 27 27 0.0005849 0.1173
## 10 11 12 0 27 27 0.0000634 0.0400
## geometry
## 1 POLYGON ((-80 40.4, -80 40....
## 2 POLYGON ((-80 40.5, -80 40....
## 3 POLYGON ((-80 40.5, -80 40....
## 4 POLYGON ((-80 40.4, -80 40....
## 5 POLYGON ((-80 40.5, -80 40....
## 6 POLYGON ((-80 40.5, -80 40....
## 7 POLYGON ((-80 40.5, -80 40....
## 8 POLYGON ((-80 40.5, -80 40....
## 9 POLYGON ((-80 40.5, -80 40....
## 10 POLYGON ((-80 40.5, -80 40....
Plot the ward polygons:
wards %>%
ggplot() +
geom_sf()
Calculate the center of each ward. We will use this to label the wards on the map:
ward_labels <- wards %>%
st_centroid() %>%
st_coordinates() %>%
as_tibble() %>%
clean_names()
| x | y |
|---|---|
| -80 | 40.4 |
| -80 | 40.5 |
| -80 | 40.5 |
| -80 | 40.4 |
| -80 | 40.5 |
Count the number of requests per ward:
df_311_count <- df_311 %>%
count(WARD, sort = TRUE)
Join the count data with the coordinates:
ward_311 <- wards %>%
left_join(df_311_count, by = c("ward" = "WARD")) %>%
bind_cols(ward_labels)
## Simple feature collection with 35 features and 10 fields
## geometry type: POLYGON
## dimension: XY
## bbox: xmin: -80.1 ymin: 40.4 xmax: -79.9 ymax: 40.5
## epsg (SRID): 4326
## proj4string: +proj=longlat +datum=WGS84 +no_defs
## First 10 features:
## objectid wards_ wards_id ward wardtext Shape__Are Shape__Len n
## 1 2 3 0 19 19 0.0011446 0.2389 27037
## 2 3 4 0 24 24 0.0001901 0.0734 5294
## 3 4 5 0 24 24 0.0000210 0.0274 5294
## 4 5 6 0 20 20 0.0011755 0.2888 17194
## 5 6 7 0 21 21 0.0002074 0.0619 3793
## 6 7 8 0 23 23 0.0001031 0.0480 4985
## 7 8 9 0 22 22 0.0001967 0.0593 4591
## 8 9 10 0 26 26 0.0009079 0.1915 10937
## 9 10 11 0 27 27 0.0005849 0.1173 10051
## 10 11 12 0 27 27 0.0000634 0.0400 10051
## geometry x y
## 1 POLYGON ((-80 40.4, -80 40.... -80 40.4
## 2 POLYGON ((-80 40.5, -80 40.... -80 40.5
## 3 POLYGON ((-80 40.5, -80 40.... -80 40.5
## 4 POLYGON ((-80 40.4, -80 40.... -80 40.4
## 5 POLYGON ((-80 40.5, -80 40.... -80 40.5
## 6 POLYGON ((-80 40.5, -80 40.... -80 40.5
## 7 POLYGON ((-80 40.5, -80 40.... -80 40.5
## 8 POLYGON ((-80 40.5, -80 40.... -80 40.5
## 9 POLYGON ((-80 40.5, -80 40.... -80 40.5
## 10 POLYGON ((-80 40.5, -80 40.... -80 40.5
Plot the data:
ward_311 %>%
ggplot() +
geom_sf(aes(fill = n), color = NA) +
geom_label(aes(x, y, label = ward), size = 3) +
scale_fill_viridis_c("Number of 311 requests")
We can use ggmap to request a basemap from the Google Maps API. Get your API key here
register_google(key = "Your key here")
pgh_map <- get_map(location = "Pittsburgh, PA", zoom = 12)
ggmap(pgh_map)
There are multiple basemap styles available:
get_map(location = "Pittsburgh, PA", zoom = 12, maptype = "satellite", source = "google") %>%
ggmap()
get_map(location = "Pittsburgh, PA", zoom = 12, maptype = "roadmap", source = "google") %>%
ggmap()
get_map(location = "Pittsburgh, PA", zoom = 12, maptype = "watercolor", source = "google") %>%
ggmap()
get_map(location = "Pittsburgh, PA", zoom = 12, maptype = "toner", source = "stamen") %>%
ggmap()
Combining maps from different systems requires us to use the same map projection. Use coord_sf to set the projection:
ggmap(pgh_map) +
geom_sf(data = ward_311, aes(fill = n), inherit.aes = FALSE, color = NA, alpha = .7) +
geom_label(data = ward_311, aes(x, y, label = ward), size = 3) +
coord_sf(crs = st_crs(4326)) +
scale_fill_viridis_c("Number of 311 requests")